1.a Yhat = 41.5 + (-0.12 * x1) + (0.05 * x2) + (2.87 * x3) + (-18.26 * x4) + (3.67 * x5) + (-1.52 * x6) + (0.28 * x7) + (-0.01 * x8) + (-0.93 * x9) + (-0.55 * x10)

# Fit the 10-predictor linear model for medv (every Boston predictor except
# indus and age) and print its coefficient table and fit statistics.
summary(lm(medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat, data=Boston))
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + lstat, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.1814  -2.7625  -0.6243   1.8448  26.3920 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  41.451747   4.903283   8.454 3.18e-16 ***
## crim         -0.121665   0.032919  -3.696 0.000244 ***
## zn            0.046191   0.013673   3.378 0.000787 ***
## chas          2.871873   0.862591   3.329 0.000935 ***
## nox         -18.262427   3.565247  -5.122 4.33e-07 ***
## rm            3.672957   0.409127   8.978  < 2e-16 ***
## dis          -1.515951   0.187675  -8.078 5.08e-15 ***
## rad           0.283932   0.063945   4.440 1.11e-05 ***
## tax          -0.012292   0.003407  -3.608 0.000340 ***
## ptratio      -0.930961   0.130423  -7.138 3.39e-12 ***
## lstat        -0.546509   0.047442 -11.519  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.789 on 495 degrees of freedom
## Multiple R-squared:  0.7342, Adjusted R-squared:  0.7289 
## F-statistic: 136.8 on 10 and 495 DF,  p-value: < 2.2e-16
# Hand-typed copy of the coefficient table from the regression summary:
# one row per model term (intercept first, then the predictors in formula
# order), columns = estimate, standard error, t value and p-value.
# Because some p-values are quoted strings, c() coerces everything to
# character, so this is a character matrix meant for display only.
datatable1 <- matrix(
  c(
    41.451747, 4.903283, 8.454, '3.18e-16',
    -0.121665, 0.032919, -3.696, 0.000244,
    0.046191, 0.013673, 3.378, 0.000787,
    2.871873, 0.862591, 3.329, 0.000935,
    -18.262427, 3.565247, -5.122, '4.33e-07',
    3.672957, 0.409127, 8.978, '2e-16',
    -1.515951, 0.187675, -8.078, '5.08e-15',
    0.283932, 0.063945, 4.440, '1.11e-05',
    -0.012292, 0.003407, -3.608, 0.000340,
    -0.930961, 0.130423, -7.138, '3.39e-12',
    -0.546509, 0.047442, -11.519, '2e-16'
  ),
  ncol = 4, byrow = TRUE
)
colnames(datatable1) <- c('Estimate', 'Std. Error', 't value', 'Pr(>|t|)')
# Row labels must follow the order of the rows above: the first predictor is
# crim, and medv is the response, so it is not a row of this table.
rownames(datatable1) <- c('intercept', 'crim', 'zn', 'chas', 'nox', 'rm',
                          'dis', 'rad', 'tax', 'ptratio', 'lstat')
as.table(datatable1)
##           Estimate   Std. Error t value Pr(>|t|)
## intercept 41.451747  4.903283   8.454   3.18e-16
## crim      -0.121665  0.032919   -3.696  0.000244
## zn        0.046191   0.013673   3.378   0.000787
## chas      2.871873   0.862591   3.329   0.000935
## nox       -18.262427 3.565247   -5.122  4.33e-07
## rm        3.672957   0.409127   8.978   2e-16   
## dis       -1.515951  0.187675   -8.078  5.08e-15
## rad       0.283932   0.063945   4.44    1.11e-05
## tax       -0.012292  0.003407   -3.608  0.00034 
## ptratio   -0.930961  0.130423   -7.138  3.39e-12
## lstat     -0.546509  0.047442   -11.519 2e-16
# Pairwise correlations between all Boston columns, rounded to one decimal,
# used to screen the predictors for strong collinearity.
round(cor(Boston),1)
##         crim   zn indus chas  nox   rm  age  dis  rad  tax ptratio lstat medv
## crim     1.0 -0.2   0.4 -0.1  0.4 -0.2  0.4 -0.4  0.6  0.6     0.3   0.5 -0.4
## zn      -0.2  1.0  -0.5  0.0 -0.5  0.3 -0.6  0.7 -0.3 -0.3    -0.4  -0.4  0.4
## indus    0.4 -0.5   1.0  0.1  0.8 -0.4  0.6 -0.7  0.6  0.7     0.4   0.6 -0.5
## chas    -0.1  0.0   0.1  1.0  0.1  0.1  0.1 -0.1  0.0  0.0    -0.1  -0.1  0.2
## nox      0.4 -0.5   0.8  0.1  1.0 -0.3  0.7 -0.8  0.6  0.7     0.2   0.6 -0.4
## rm      -0.2  0.3  -0.4  0.1 -0.3  1.0 -0.2  0.2 -0.2 -0.3    -0.4  -0.6  0.7
## age      0.4 -0.6   0.6  0.1  0.7 -0.2  1.0 -0.7  0.5  0.5     0.3   0.6 -0.4
## dis     -0.4  0.7  -0.7 -0.1 -0.8  0.2 -0.7  1.0 -0.5 -0.5    -0.2  -0.5  0.2
## rad      0.6 -0.3   0.6  0.0  0.6 -0.2  0.5 -0.5  1.0  0.9     0.5   0.5 -0.4
## tax      0.6 -0.3   0.7  0.0  0.7 -0.3  0.5 -0.5  0.9  1.0     0.5   0.5 -0.5
## ptratio  0.3 -0.4   0.4 -0.1  0.2 -0.4  0.3 -0.2  0.5  0.5     1.0   0.4 -0.5
## lstat    0.5 -0.4   0.6 -0.1  0.6 -0.6  0.6 -0.5  0.5  0.5     0.4   1.0 -0.7
## medv    -0.4  0.4  -0.5  0.2 -0.4  0.7 -0.4  0.2 -0.4 -0.5    -0.5  -0.7  1.0
# Scatter plot of medv against every predictor, one plot per column, to check
# by eye whether each relationship looks roughly linear.
x = subset(Boston, select = - medv)
for (col in names(x)) {
  # Plot the predictor on its own scale (no offset) so the axis values match
  # the axis label, and name the predictor in the title.
  plot(x = x[[col]], y = Boston$medv,
    xlab = col,
    ylab = "medv",
    main = paste("medv vs", col)
  )
}

# Best-subset search over all Boston predictors of medv, keeping the single
# best model (nbest=1) for each size from 1 up to 12 predictors (nvmax=12),
# then print which predictors enter at each size.
best.model = regsubsets(medv~., data = Boston, nbest=1, nvmax=12)
summary(best.model)
## Subset selection object
## Call: regsubsets.formula(medv ~ ., data = Boston, nbest = 1, nvmax = 12)
## 12 Variables  (and intercept)
##         Forced in Forced out
## crim        FALSE      FALSE
## zn          FALSE      FALSE
## indus       FALSE      FALSE
## chas        FALSE      FALSE
## nox         FALSE      FALSE
## rm          FALSE      FALSE
## age         FALSE      FALSE
## dis         FALSE      FALSE
## rad         FALSE      FALSE
## tax         FALSE      FALSE
## ptratio     FALSE      FALSE
## lstat       FALSE      FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
##           crim zn  indus chas nox rm  age dis rad tax ptratio lstat
## 1  ( 1 )  " "  " " " "   " "  " " " " " " " " " " " " " "     "*"  
## 2  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " " "     "*"  
## 3  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " "*"     "*"  
## 4  ( 1 )  " "  " " " "   " "  " " "*" " " "*" " " " " "*"     "*"  
## 5  ( 1 )  " "  " " " "   " "  "*" "*" " " "*" " " " " "*"     "*"  
## 6  ( 1 )  " "  " " " "   "*"  "*" "*" " " "*" " " " " "*"     "*"  
## 7  ( 1 )  " "  "*" " "   "*"  "*" "*" " " "*" " " " " "*"     "*"  
## 8  ( 1 )  "*"  "*" " "   "*"  "*" "*" " " "*" " " " " "*"     "*"  
## 9  ( 1 )  "*"  "*" " "   " "  "*" "*" " " "*" "*" "*" "*"     "*"  
## 10  ( 1 ) "*"  "*" " "   "*"  "*" "*" " " "*" "*" "*" "*"     "*"  
## 11  ( 1 ) "*"  "*" " "   "*"  "*" "*" "*" "*" "*" "*" "*"     "*"  
## 12  ( 1 ) "*"  "*" "*"   "*"  "*" "*" "*" "*" "*" "*" "*"     "*"
# Validation-set approach: split Boston in half, run best-subset selection on
# the training half only, and score each model size by its MSE on the
# held-out half.
n = dim(Boston)[1]
set.seed(1)
train_index = sample(1:n, n/2, replace=FALSE)

train = Boston[train_index,]
test = Boston[-train_index,]

# Select on the training rows only, so the held-out rows stay unseen.
best.model.train = regsubsets(medv~., data = train, nbest=1, nvmax=12)

# Design matrix of the held-out rows; it does not depend on the model size,
# so it is built once outside the loop.
test.mat = model.matrix(medv~., data=test)

val.errors = rep(NA,12)
for(i in 1:12){
  # Coefficients of the best i-predictor model found on the training half.
  coef.m = coef(best.model.train, id=i)

  # Predict the held-out rows using only the columns that model uses.
  pred = test.mat[,names(coef.m)]%*%coef.m

  # Validation MSE: squared difference between observed and predicted medv.
  val.errors[i] = mean((test$medv - pred)^2)
}

# Best-subset fit on the full Boston data and a side-by-side table of the
# selection criteria (RSS, AIC, BIC, adjusted R^2, Cp) for every model size.
regfitt <- regsubsets(medv ~ ., data = Boston, nbest = 1, nvmax = 12)
regfitt.sum <- summary(regfitt)

# Per-size quantities pulled out of the summary object.
rss <- regfitt.sum[["rss"]]
cp <- regfitt.sum[["cp"]]
adjr2 <- regfitt.sum[["adjr2"]]
# Row sums of the inclusion matrix: number of terms in each model, counting
# the intercept column.
p <- rowSums(regfitt.sum[["which"]])

# Both criteria share the n * log(RSS / n) fit term and differ only in the
# penalty per term: 2 for AIC, log(n) for BIC.
AIC <- n * log(rss / n) + 2 * p
BIC <- n * log(rss / n) + p * log(n)
cbind(rss, AIC, BIC, adjr2, cp)
##         rss      AIC      BIC     adjr2         cp
## 1  19472.38 1851.009 1859.462 0.5432418 343.848074
## 2  15439.31 1735.577 1748.256 0.6371245 170.658081
## 3  13727.99 1678.131 1695.038 0.6767036  98.320999
## 4  13228.91 1661.393 1682.526 0.6878351  78.641892
## 5  12469.34 1633.473 1658.832 0.7051702  47.647706
## 6  12141.07 1621.973 1651.559 0.7123567  35.388139
## 7  11976.67 1617.075 1650.887 0.7156820  30.246610
## 8  11805.76 1611.802 1649.841 0.7191751  24.822922
## 9  11606.40 1605.184 1647.450 0.7233609  18.162742
## 10 11352.19 1595.978 1642.470 0.7288734   9.120223
## 11 11350.50 1597.903 1648.622 0.7283649  11.046965
## 12 11349.42 1599.855 1654.800 0.7278399  13.000000
# Model size that minimises AIC.
which.min(AIC)
## 10 
## 10
# Model size that minimises BIC.
which.min(BIC)
## 10 
## 10
# Model size that minimises Mallows' Cp.
which.min(cp)
## [1] 10
# Model size that maximises adjusted R^2.
which.max(adjr2)
## [1] 10
# Coefficients of the best 10-predictor model, the size chosen above.
coef(regfitt,10)
##  (Intercept)         crim           zn         chas          nox           rm 
##  41.45174748  -0.12166488   0.04619119   2.87187265 -18.26242664   3.67295747 
##          dis          rad          tax      ptratio        lstat 
##  -1.51595105   0.28393226  -0.01229150  -0.93096144  -0.54650916

1.b Since in class we discussed not removing any variables if possible, I checked for independence and found it, so I chose to run best subset selection with all the independent variables. All of them except chas had a linear relationship with medv. After running best subset selection I kept the chas variable, since it showed up in half of the best models.

1.c After that I checked which model had the lowest AIC, BIC and Cp and the highest adjusted R^2, and the 10-variable model was picked each time.

# Diagnostic plots for the selected 10-predictor model.
m10 <- lm(
  medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat,
  data = Boston
)
plot(m10)

# Same diagnostics for the 11-predictor model that also includes age.
m11 <- lm(
  medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + lstat + age,
  data = Boston
)
plot(m11)

1.d We assume a linear relationship between the independent variables and the dependent variable, and found it for most of them. We also checked for independence, and checked the diagnostic plots to make sure there were no patterns in the residuals and that they look approximately normal.

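1.e For this I checked for multicollinearity using the correlation matrix and did not really find any variable with too high a correlation, although rad and tax (correlation 0.9) are strongly correlated and worth keeping an eye on.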

2.a I would assume that the best subset model will have the smallest training MSE, because it fits a model for each size from 1 up to the number of predictors and returns, for each size, the model with the smallest RSS, so it will eventually find the smallest RSS possible. Since MSE = RSS/n, we can assume that best subset selection will give the smallest training MSE.

2.b Best subset is able to search through more models and pick one with a lower training MSE, but it is more prone to overfitting, which can cause the test MSE to rise. However, it still has a better chance of finding a lower test MSE, since it picks between more models.

2.c Yes, they lead me to pick the same model. The best model by AIC had 12 variables for both forward and backward selection, and both gave a test MSE of 1795991.

# Preview the first rows of the College data to see the available columns.
head(College)
##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55
# 90/10 train/test split of College (seeded for reproducibility), followed by
# forward and backward stepwise selection for Apps with up to 17 predictors.
n <- nrow(College)
set.seed(1)
train_i <- sample(1:n, n * .9, replace = FALSE)

train <- College[train_i, ]
test <- College[-train_i, ]

# NOTE(review): both searches are run on the full College data rather than on
# `train` -- confirm this is intended.
regfit.fwd <- regsubsets(Apps ~ ., data = College, nvmax = 17, method = "forward")
regfit.bwd <- regsubsets(Apps ~ ., data = College, nvmax = 17, method = "backward")

# Selection criteria for the forward-stepwise fit, one value per model size.
regfit.fwd.sum <- summary(regfit.fwd)
names(regfit.fwd.sum)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
n <- nrow(College)
rss <- regfit.fwd.sum[["rss"]]
cp <- regfit.fwd.sum[["cp"]]
adjr2 <- regfit.fwd.sum[["adjr2"]]
# Number of predictors + intercept in each model (row sums of the inclusion
# matrix).
p <- rowSums(regfit.fwd.sum[["which"]])
# Same fit term for both criteria; the penalty per term is 2 for AIC and
# log(n) for BIC.
AIC <- n * log(rss / n) + 2 * p
BIC <- n * log(rss / n) + p * log(n)

# Forward stepwise: model size that minimises AIC.
which.min(AIC)
## 12 
## 12
# Forward stepwise: model size that minimises BIC.
which.min(BIC)
## 10 
## 10
# Forward stepwise: model size that maximises adjusted R^2.
which.max(adjr2)
## [1] 13
# Forward stepwise: model size that minimises Mallows' Cp.
which.min(cp)
## [1] 12
# Selection criteria for the backward-stepwise fit, one value per model size.
regfit.bwd.sum = summary(regfit.bwd)
names(regfit.bwd.sum)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
nb = dim(College)[1]
# Number of predictors + intercept in each backward model.
pb = rowSums(regfit.bwd.sum$which)
adjr2b = regfit.bwd.sum$adjr2
cpb = regfit.bwd.sum$cp
rssb = regfit.bwd.sum$rss
# Use the backward fit's own RSS (rssb) and sample size (nb); reusing the
# forward-selection `rss` here would score the wrong models.
AICb = nb*log(rssb/nb) + 2*(pb)
BICb = nb*log(rssb/nb) + (pb)*log(nb)

# Backward stepwise: model size that minimises AIC.
which.min(AICb)
## 12 
## 12
# Backward stepwise: model size that minimises BIC.
which.min(BICb)
## 10 
## 10
# Backward stepwise: model size that maximises adjusted R^2.
which.max(adjr2b)
## [1] 13
# Backward stepwise: model size that minimises Mallows' Cp.
which.min(cpb)
## [1] 12
# Intercept-only model for Apps: the lower bound of the stepwise search scope.
model0 = lm(Apps~1,data=College)
summary(model0)
## 
## Call:
## lm(formula = Apps ~ 1, data = College)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -2921  -2226  -1444    622  45092 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3001.6      138.8   21.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3870 on 776 degrees of freedom
# Full model for Apps using every other College column: the upper bound of
# the stepwise search scope.
modelfull = lm(Apps~.,data=College)
summary(modelfull)
## 
## Call:
## lm(formula = Apps ~ ., data = College)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4908.8  -430.2   -29.5   322.3  7852.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -445.08413  408.32855  -1.090 0.276053    
## PrivateYes  -494.14897  137.81191  -3.586 0.000358 ***
## Accept         1.58581    0.04074  38.924  < 2e-16 ***
## Enroll        -0.88069    0.18596  -4.736 2.60e-06 ***
## Top10perc     49.92628    5.57824   8.950  < 2e-16 ***
## Top25perc    -14.23448    4.47914  -3.178 0.001543 ** 
## F.Undergrad    0.05739    0.03271   1.754 0.079785 .  
## P.Undergrad    0.04445    0.03214   1.383 0.167114    
## Outstate      -0.08587    0.01906  -4.506 7.64e-06 ***
## Room.Board     0.15103    0.04829   3.127 0.001832 ** 
## Books          0.02090    0.23841   0.088 0.930175    
## Personal       0.03110    0.06308   0.493 0.622060    
## PhD           -8.67850    4.63814  -1.871 0.061714 .  
## Terminal      -3.33066    5.09494  -0.654 0.513492    
## S.F.Ratio     15.38961   13.00622   1.183 0.237081    
## perc.alumni    0.17867    4.10230   0.044 0.965273    
## Expend         0.07790    0.01235   6.308 4.79e-10 ***
## Grad.Rate      8.66763    2.94893   2.939 0.003390 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1041 on 759 degrees of freedom
## Multiple R-squared:  0.9292, Adjusted R-squared:  0.9276 
## F-statistic: 585.9 on 17 and 759 DF,  p-value: < 2.2e-16
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
## 
##     Boston
# Look up the help page for stepAIC (documentation lookup only; it does not
# affect the analysis).
?stepAIC

# Forward stepwise selection by AIC: start from the intercept-only model and
# add one term at a time, never going beyond the terms in modelfull.
stepAIC(model0,scope=list(lower=model0,upper=modelfull),direction="forward")
## Start:  AIC=12838.69
## Apps ~ 1
## 
##               Df  Sum of Sq        RSS   AIC
## + Accept       1 1.0346e+10 1.2774e+09 11125
## + Enroll       1 8.3351e+09 3.2881e+09 11860
## + F.Undergrad  1 7.7108e+09 3.9125e+09 11995
## + Private      1 2.1701e+09 9.4531e+09 12680
## + P.Undergrad  1 1.8436e+09 9.7797e+09 12706
## + PhD          1 1.7742e+09 9.8491e+09 12712
## + Terminal     1 1.5869e+09 1.0036e+10 12727
## + Top25perc    1 1.4372e+09 1.0186e+10 12738
## + Top10perc    1 1.3344e+09 1.0289e+10 12746
## + Expend       1 7.8327e+08 1.0840e+10 12786
## + Personal     1 3.7130e+08 1.1252e+10 12816
## + Room.Board   1 3.1621e+08 1.1307e+10 12819
## + Grad.Rate    1 2.5033e+08 1.1373e+10 12824
## + Books        1 2.0424e+08 1.1419e+10 12827
## + S.F.Ratio    1 1.0630e+08 1.1517e+10 12834
## + perc.alumni  1 9.4622e+07 1.1529e+10 12834
## <none>                      1.1623e+10 12839
## + Outstate     1 2.9243e+07 1.1594e+10 12839
## 
## Step:  AIC=11124.94
## Apps ~ Accept
## 
##               Df Sum of Sq        RSS   AIC
## + Top10perc    1 298543648  978867162 10920
## + Expend       1 237832439 1039578371 10967
## + Top25perc    1 172865433 1104545378 11014
## + Grad.Rate    1  80919712 1196491098 11076
## + Room.Board   1  73480089 1203930722 11081
## + Outstate     1  64480760 1212930051 11087
## + S.F.Ratio    1  59842954 1217567857 11090
## + perc.alumni  1  43974982 1233435829 11100
## + PhD          1  40339305 1237071506 11102
## + Terminal     1  34118294 1243292517 11106
## + Enroll       1  12102492 1265308319 11120
## + Books        1   7628546 1269782265 11122
## + F.Undergrad  1   5226771 1272184040 11124
## + P.Undergrad  1   4704068 1272706743 11124
## + Private      1   3980419 1273430392 11124
## <none>                     1277410811 11125
## + Personal     1   1436992 1275973818 11126
## 
## Step:  AIC=10920.1
## Apps ~ Accept + Top10perc
## 
##               Df Sum of Sq       RSS   AIC
## + Expend       1  29658293 949208869 10898
## + Top25perc    1  22836110 956031052 10904
## + Enroll       1  13912570 964954593 10911
## + Private      1  10668209 968198953 10914
## + PhD          1   7596359 971270804 10916
## + Room.Board   1   6159533 972707629 10917
## + Outstate     1   5782676 973084487 10918
## + Terminal     1   5767400 973099762 10918
## + perc.alumni  1   5577199 973289963 10918
## + P.Undergrad  1   2567814 976299348 10920
## <none>                     978867162 10920
## + F.Undergrad  1   1718371 977148792 10921
## + Personal     1   1404273 977462889 10921
## + Books        1   1098190 977768973 10921
## + Grad.Rate    1    315353 978551809 10922
## + S.F.Ratio    1     73757 978793406 10922
## 
## Step:  AIC=10898.2
## Apps ~ Accept + Top10perc + Expend
## 
##               Df Sum of Sq       RSS   AIC
## + Outstate     1  34037615 915171254 10872
## + Private      1  21921323 927287546 10882
## + Top25perc    1  14720207 934488662 10888
## + PhD          1  12430477 936778392 10890
## + Terminal     1  11956757 937252112 10890
## + perc.alumni  1  11589036 937619833 10891
## + Enroll       1   8007593 941201276 10894
## + S.F.Ratio    1   7737137 941471733 10894
## + P.Undergrad  1   2918539 946290330 10898
## <none>                     949208869 10898
## + Personal     1   2112443 947096426 10898
## + Books        1    637723 948571146 10900
## + Room.Board   1    264603 948944266 10900
## + F.Undergrad  1     52599 949156270 10900
## + Grad.Rate    1      1172 949207697 10900
## 
## Step:  AIC=10871.82
## Apps ~ Accept + Top10perc + Expend + Outstate
## 
##               Df Sum of Sq       RSS   AIC
## + Enroll       1  29010663 886160591 10849
## + Room.Board   1  16274538 898896716 10860
## + Top25perc    1  10654640 904516614 10865
## + F.Undergrad  1   8296040 906875214 10867
## + PhD          1   7700769 907470485 10867
## + Grad.Rate    1   7321591 907849663 10868
## + Terminal     1   6021988 909149266 10869
## + Private      1   3410109 911761145 10871
## <none>                     915171254 10872
## + S.F.Ratio    1   2129211 913042043 10872
## + perc.alumni  1   1979846 913191408 10872
## + P.Undergrad  1    318048 914853206 10874
## + Books        1    270308 914900946 10874
## + Personal     1     51356 915119898 10874
## 
## Step:  AIC=10848.79
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll
## 
##               Df Sum of Sq       RSS   AIC
## + Room.Board   1  11466506 874694084 10841
## + Top25perc    1  11359437 874801154 10841
## + Private      1   9551354 876609237 10842
## + F.Undergrad  1   6287657 879872934 10845
## + PhD          1   5184017 880976573 10846
## + Grad.Rate    1   4977276 881183315 10846
## + P.Undergrad  1   4196016 881964575 10847
## + Terminal     1   3667705 882492885 10848
## + S.F.Ratio    1   3471226 882689365 10848
## <none>                     886160591 10849
## + perc.alumni  1   1150617 885009974 10850
## + Personal     1    364916 885795675 10850
## + Books        1    346795 885813795 10850
## 
## Step:  AIC=10840.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board
## 
##               Df Sum of Sq       RSS   AIC
## + Top25perc    1  11838452 862855633 10832
## + Private      1  10278235 864415849 10834
## + PhD          1   6633253 868060832 10837
## + Terminal     1   5679065 869015019 10838
## + F.Undergrad  1   5218200 869475884 10838
## + Grad.Rate    1   3809312 870884773 10839
## + S.F.Ratio    1   3268726 871425358 10840
## + P.Undergrad  1   2771544 871922540 10840
## <none>                     874694084 10841
## + Personal     1    483131 874210954 10842
## + perc.alumni  1    430235 874263849 10842
## + Books        1     32071 874662014 10843
## 
## Step:  AIC=10832.09
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc
## 
##               Df Sum of Sq       RSS   AIC
## + Private      1  11486640 851368993 10824
## + F.Undergrad  1   6829025 856026608 10828
## + Grad.Rate    1   5022152 857833480 10830
## + PhD          1   4164868 858690765 10830
## + S.F.Ratio    1   3313460 859542173 10831
## + P.Undergrad  1   3150967 859704666 10831
## + Terminal     1   2866270 859989362 10832
## <none>                     862855633 10832
## + Personal     1    424835 862430798 10834
## + perc.alumni  1    111184 862744449 10834
## + Books        1     56442 862799190 10834
## 
## Step:  AIC=10823.67
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private
## 
##               Df Sum of Sq       RSS   AIC
## + PhD          1  10749717 840619277 10816
## + Terminal     1   8264997 843103997 10818
## + Grad.Rate    1   6882166 844486827 10819
## + F.Undergrad  1   3793524 847575469 10822
## <none>                     851368993 10824
## + P.Undergrad  1   1682974 849686020 10824
## + S.F.Ratio    1   1313916 850055078 10824
## + Personal     1    283343 851085650 10825
## + Books        1    119956 851249037 10826
## + perc.alumni  1      3141 851365852 10826
## 
## Step:  AIC=10815.8
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD
## 
##               Df Sum of Sq       RSS   AIC
## + Grad.Rate    1   7349094 833270183 10811
## + F.Undergrad  1   4405096 836214181 10814
## + P.Undergrad  1   2451457 838167820 10816
## <none>                     840619277 10816
## + S.F.Ratio    1   1808849 838810427 10816
## + Terminal     1    487207 840132070 10817
## + Personal     1    284996 840334280 10818
## + perc.alumni  1     78065 840541211 10818
## + Books        1      3237 840616040 10818
## 
## Step:  AIC=10810.98
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD + Grad.Rate
## 
##               Df Sum of Sq       RSS   AIC
## + F.Undergrad  1   5704713 827565469 10808
## + P.Undergrad  1   4323227 828946955 10809
## <none>                     833270183 10811
## + S.F.Ratio    1   1711005 831559177 10811
## + Personal     1    841830 832428353 10812
## + Terminal     1    352500 832917683 10813
## + perc.alumni  1    117245 833152938 10813
## + Books        1     58008 833212174 10813
## 
## Step:  AIC=10807.64
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD + Grad.Rate + F.Undergrad
## 
##               Df Sum of Sq       RSS   AIC
## + P.Undergrad  1   2248227 825317242 10808
## <none>                     827565469 10808
## + S.F.Ratio    1   1437928 826127541 10808
## + Terminal     1    515424 827050045 10809
## + Personal     1    426703 827138766 10809
## + perc.alumni  1     39675 827525794 10810
## + Books        1     25040 827540429 10810
## 
## Step:  AIC=10807.53
## Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + 
##     Top25perc + Private + PhD + Grad.Rate + F.Undergrad + P.Undergrad
## 
##               Df Sum of Sq       RSS   AIC
## <none>                     825317242 10808
## + S.F.Ratio    1   1485954 823831288 10808
## + Terminal     1    513811 824803431 10809
## + Personal     1    227696 825089547 10809
## + perc.alumni  1     20671 825296571 10810
## + Books        1     13607 825303635 10810
## 
## Call:
## lm(formula = Apps ~ Accept + Top10perc + Expend + Outstate + 
##     Enroll + Room.Board + Top25perc + Private + PhD + Grad.Rate + 
##     F.Undergrad + P.Undergrad, data = College)
## 
## Coefficients:
## (Intercept)       Accept    Top10perc       Expend     Outstate       Enroll  
##  -157.28686      1.58691     50.41132      0.07247     -0.09018     -0.88265  
##  Room.Board    Top25perc   PrivateYes          PhD    Grad.Rate  F.Undergrad  
##     0.14777    -14.74735   -511.78760    -10.70503      8.63961      0.05945  
## P.Undergrad  
##     0.04593
# Backward stepwise selection by AIC: start from the full model and drop one
# term at a time, never going below the intercept-only model.
stepAIC(modelfull,scope=list(lower=model0,upper=modelfull),direction="backward")
## Start:  AIC=10815.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + perc.alumni + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC
## - perc.alumni  1       2057  823062005 10813
## - Books        1       8332  823068280 10813
## - Personal     1     263707  823323655 10814
## - Terminal     1     463415  823523363 10814
## - S.F.Ratio    1    1518247  824578195 10815
## - P.Undergrad  1    2073704  825133652 10815
## <none>                       823059948 10815
## - F.Undergrad  1    3337258  826397207 10816
## - PhD          1    3796560  826856508 10817
## - Grad.Rate    1    9368302  832428250 10822
## - Room.Board   1   10605426  833665374 10823
## - Top25perc    1   10951733  834011681 10824
## - Private      1   13942221  837002170 10826
## - Outstate     1   22020341  845080289 10834
## - Enroll       1   24321652  847381600 10836
## - Expend       1   43151679  866211628 10853
## - Top10perc    1   86866642  909926590 10891
## - Accept       1 1642984489 2466044437 11666
## 
## Step:  AIC=10813.4
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Books + Personal + 
##     PhD + Terminal + S.F.Ratio + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC
## - Books        1       7989  823069994 10811
## - Personal     1     261699  823323704 10812
## - Terminal     1     461438  823523443 10812
## - S.F.Ratio    1    1517299  824579304 10813
## - P.Undergrad  1    2071755  825133760 10813
## <none>                       823062005 10813
## - F.Undergrad  1    3335997  826398002 10814
## - PhD          1    3799154  826861159 10815
## - Grad.Rate    1    9830950  832892955 10821
## - Room.Board   1   10817132  833879137 10822
## - Top25perc    1   10979163  834041168 10822
## - Private      1   14029065  837091070 10824
## - Outstate     1   22841086  845903092 10833
## - Enroll       1   24505771  847567776 10834
## - Expend       1   43192465  866254470 10851
## - Top10perc    1   86934199  909996204 10889
## - Accept       1 1686807594 2509869599 11678
## 
## Step:  AIC=10811.41
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + Personal + PhD + Terminal + 
##     S.F.Ratio + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC
## - Personal     1     286484  823356478 10810
## - Terminal     1     453536  823523530 10810
## - S.F.Ratio    1    1523776  824593770 10811
## - P.Undergrad  1    2073781  825143774 10811
## <none>                       823069994 10811
## - F.Undergrad  1    3337954  826407948 10813
## - PhD          1    3936953  827006947 10813
## - Grad.Rate    1    9823693  832893687 10819
## - Top25perc    1   10971468  834041462 10820
## - Room.Board   1   11052994  834122988 10820
## - Private      1   14021861  837091855 10822
## - Outstate     1   22934693  846004687 10831
## - Enroll       1   24507814  847577808 10832
## - Expend       1   43299794  866369788 10849
## - Top10perc    1   87145523  910215517 10888
## - Accept       1 1687626727 2510696721 11676
## 
## Step:  AIC=10809.68
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + PhD + Terminal + S.F.Ratio + 
##     Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC
## - Terminal     1     474810  823831288 10808
## - S.F.Ratio    1    1446954  824803431 10809
## <none>                       823356478 10810
## - P.Undergrad  1    2294055  825650532 10810
## - F.Undergrad  1    3524082  826880559 10811
## - PhD          1    3903516  827259993 10811
## - Grad.Rate    1    9576067  832932544 10817
## - Room.Board   1   10948047  834304524 10818
## - Top25perc    1   11009477  834365954 10818
## - Private      1   14045232  837401710 10821
## - Outstate     1   23757582  847114060 10830
## - Enroll       1   24529642  847886120 10830
## - Expend       1   43741282  867097760 10848
## - Top10perc    1   87332619  910689096 10886
## - Accept       1 1688998295 2512354773 11674
## 
## Step:  AIC=10808.13
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + PhD + S.F.Ratio + Expend + 
##     Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC
## - S.F.Ratio    1    1485954  825317242 10808
## <none>                       823831288 10808
## - P.Undergrad  1    2296253  826127541 10808
## - F.Undergrad  1    3402201  827233489 10809
## - Grad.Rate    1    9725730  833557018 10815
## - Room.Board   1   10580678  834411966 10816
## - Top25perc    1   11852393  835683681 10817
## - PhD          1   13174760  837006048 10818
## - Private      1   13675900  837507188 10819
## - Enroll       1   24420572  848251860 10829
## - Outstate     1   24881049  848712337 10829
## - Expend       1   43404484  867235772 10846
## - Top10perc    1   89940095  913771383 10887
## - Accept       1 1691950930 2515782218 11674
## 
## Step:  AIC=10807.53
## Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + 
##     P.Undergrad + Outstate + Room.Board + PhD + Expend + Grad.Rate
## 
##               Df  Sum of Sq        RSS   AIC
## <none>                       825317242 10808
## - P.Undergrad  1    2248227  827565469 10808
## - F.Undergrad  1    3629713  828946955 10809
## - Grad.Rate    1    9850583  835167825 10815
## - Room.Board   1   10699017  836016260 10816
## - Top25perc    1   12037817  837355059 10817
## - PhD          1   12708568  838025810 10817
## - Private      1   15691081  841008323 10820
## - Enroll       1   24676722  849993965 10828
## - Outstate     1   26201946  851519188 10830
## - Expend       1   43734225  869051468 10846
## - Top10perc    1   89928332  915245574 10886
## - Accept       1 1696846612 2522163854 11674
## 
## Call:
## lm(formula = Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + 
##     F.Undergrad + P.Undergrad + Outstate + Room.Board + PhD + 
##     Expend + Grad.Rate, data = College)
## 
## Coefficients:
## (Intercept)   PrivateYes       Accept       Enroll    Top10perc    Top25perc  
##  -157.28686   -511.78760      1.58691     -0.88265     50.41132    -14.74735  
## F.Undergrad  P.Undergrad     Outstate   Room.Board          PhD       Expend  
##     0.05945      0.04593     -0.09018      0.14777    -10.70503      0.07247  
##   Grad.Rate  
##     8.63961
# Refit the 12-predictor model chosen by backward stepAIC on the training
# rows only, then report its training and test MSE.
# Fitting on `train` (not all of College) keeps the test rows out of the fit
# and makes the fitted values line up one-to-one with train$Apps.
model_train = lm(formula = Apps ~ Private + Accept + Enroll + Top10perc + Top25perc + F.Undergrad + P.Undergrad + Outstate + Room.Board + PhD + Expend + Grad.Rate, data = train)

# Training MSE: mean squared residual over the training rows.
MSE_train = mean((train$Apps - model_train$fitted.values)^2)
MSE_train
# NOTE: the value previously recorded here (28530514) was computed from
# vectors of different lengths (fitted values for all of College recycled
# against train$Apps, with a warning); re-run to refresh.

# Test MSE: mean squared prediction error over the held-out rows.
predicted_values = predict(model_train,test)
MSE_test = mean((test$Apps - predicted_values)^2)
MSE_test
# NOTE: the value previously recorded here (1795991) came from a model that
# had also been fitted on the test rows; re-run to refresh.
# Refit the 12-predictor model chosen by forward stepAIC on the training
# rows only, then report its training and test MSE.
# Fitting on `train` (not all of College) keeps the test rows out of the fit
# and makes the fitted values line up one-to-one with train$Apps.
model_trainb = lm(formula = Apps ~ Accept + Top10perc + Expend + Outstate + Enroll + Room.Board + Top25perc + Private + PhD + Grad.Rate + F.Undergrad + P.Undergrad, data = train)

# Training MSE: mean squared residual over the training rows.
MSE_trainb = mean((train$Apps - model_trainb$fitted.values)^2)
MSE_trainb
# NOTE: the value previously recorded here (28530514) was computed from
# vectors of different lengths (fitted values for all of College recycled
# against train$Apps, with a warning); re-run to refresh.

# Test MSE: mean squared prediction error over the held-out rows.
predicted_valuesb = predict(model_trainb,test)
MSE_testb = mean((test$Apps - predicted_valuesb)^2)
MSE_testb
# NOTE: the value previously recorded here (1795991) came from a model that
# had also been fitted on the test rows; re-run to refresh.
  1. In this graph we see that even though the two variables are not statistically significant, we still see a very small p-value overall, with high standard errors.

4.a

# Convert the four categorical Credit columns to factors in one pass, then
# preview the result.
Credit[c("Own", "Student", "Married", "Region")] <-
  lapply(Credit[c("Own", "Student", "Married", "Region")], factor)
head(Credit)
##    Income Limit Rating Cards Age Education Own Student Married Region Balance
## 1  14.891  3606    283     2  34        11  No      No     Yes  South     333
## 2 106.025  6645    483     3  82        15 Yes     Yes     Yes   West     903
## 3 104.593  7075    514     4  71        11  No      No      No   West     580
## 4 148.924  9504    681     3  36        11 Yes      No      No   West     964
## 5  55.882  4897    357     2  68        16  No      No     Yes  South     331
## 6  80.180  8047    569     4  77        10  No      No      No  South    1151

4.b

# Additive model: Balance on Income plus a Student indicator, giving one
# common Income slope and a separate intercept for students.
fit = lm(Balance~ Income+ Student,data=Credit)
summary(fit)
## 
## Call:
## lm(formula = Balance ~ Income + Student, data = Credit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -762.37 -331.38  -45.04  323.60  818.28 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 211.1430    32.4572   6.505 2.34e-10 ***
## Income        5.9843     0.5566  10.751  < 2e-16 ***
## StudentYes  382.6705    65.3108   5.859 9.78e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 391.8 on 397 degrees of freedom
## Multiple R-squared:  0.2775, Adjusted R-squared:  0.2738 
## F-statistic: 76.22 on 2 and 397 DF,  p-value: < 2.2e-16

4.c Student: Yhat = 593.8135 + 5.9843Income Non Student: Yhat = 211.1430 + 5.9843Income

4.d For both students and non-students, a one-unit increase in Income is associated with an increase of 5.9843 units in Balance, controlling for the other predictors.

4.e Looking at the plot below, we can see that it does not really make sense to assume that Balance grows with Income at the same rate for students and non-students.

library(ggplot2)

# Scatter plot of Balance against Income, coloured by Student status.
# Because colour is mapped to Student, geom_smooth fits a separate
# least-squares line for each group, so the two slopes are allowed to differ.
ggplot(Credit, aes(x = Income, y = Balance, color = Student)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

4.f Student: Yhat = 677.299 + 4.2188Income Non Student: Yhat = 200.6232 + 6.2182Income

# Interaction model: adds Income:Student so the Income slope can differ
# between students and non-students.
summary(lm(Balance ~ Income + Student + Income:Student, data=Credit))
## 
## Call:
## lm(formula = Balance ~ Income + Student + Income:Student, data = Credit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -773.39 -325.70  -41.13  321.65  814.04 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       200.6232    33.6984   5.953 5.79e-09 ***
## Income              6.2182     0.5921  10.502  < 2e-16 ***
## StudentYes        476.6758   104.3512   4.568 6.59e-06 ***
## Income:StudentYes  -1.9992     1.7313  -1.155    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 391.6 on 396 degrees of freedom
## Multiple R-squared:  0.2799, Adjusted R-squared:  0.2744 
## F-statistic:  51.3 on 3 and 396 DF,  p-value: < 2.2e-16

4.g Student: A one-unit increase in Income is associated with an increase of 4.2188 units in Balance, controlling for the other predictors. Non-student: A one-unit increase in Income is associated with an increase of 6.2182 units in Balance, controlling for the other predictors.